Code
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)Welcome to my presentation! Today I’m going to showcase what I’ve been doing over the summer. This is the culmination of most of my work, I welcome and encourage feedback before my poster presentation!
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)compare_vcfs = function(quilt_input,geno_input,census_input,x_order){
census_input = census_input[order(census_input$V1),]
quilt_input$READS = census_input$V2
meltedbar = subset(quilt_input, select = c(DONOR,READS,COUNT))
meltedbar = melt(meltedbar,id="DONOR")
colnames(meltedbar) <- c('Donor','Filtered','Reads')
p1 <- ggplot(data=meltedbar, aes(x=Donor,y=Reads, fill=Filtered)) +
geom_bar(position="identity",stat="identity") +
scale_fill_manual(values=c("#D41159", "#69C561")) +
scale_x_discrete(limits = x_order) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(
panel.background = element_rect(fill='transparent'), #transparent panel bg
plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
panel.grid.major = element_blank(), #remove major gridlines
panel.grid.minor = element_blank(), #remove minor gridlines
legend.background = element_rect(fill='transparent'), #transparent legend bg
legend.box.background = element_rect(fill='transparent') #transparent legend panel
)
quilt_input$G_REP = geno_input$REPRESENTATION
quilt_input$Genoprobs = quilt_input$G_REP - quilt_input$KNOWN
quilt_input$Quilt = quilt_input$REPRESENTATION - quilt_input$KNOWN
meltedline = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline = melt(meltedline,id="DONOR")
colnames(meltedline) <- c('Donor','VCF','Differences')
p2 <- ggplot(data = meltedline) +
geom_hline(yintercept=0,size=0.5)+
geom_line(aes(x = Donor, y = Differences, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = Differences, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(
panel.background = element_rect(fill='transparent'), #transparent panel bg
plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
panel.grid.major = element_blank(), #remove major gridlines
panel.grid.minor = element_blank(), #remove minor gridlines
legend.background = element_rect(fill='transparent'), #transparent legend bg
legend.box.background = element_rect(fill='transparent') #transparent legend panel
)
quilt_input$Genoprobs = ((quilt_input$G_REP - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
quilt_input$Quilt = ((quilt_input$REPRESENTATION - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
meltedline_error = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline_error = melt(meltedline_error,id="DONOR")
colnames(meltedline_error) <- c('Donor','VCF','PercentError')
p3 <- ggplot(data = meltedline_error) +
geom_hline(yintercept=0,size=0.5)+
geom_line(aes(x = Donor, y = PercentError, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = PercentError, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(
panel.background = element_rect(fill='transparent'), #transparent panel bg
plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
panel.grid.major = element_blank(), #remove major gridlines
panel.grid.minor = element_blank(), #remove minor gridlines
legend.background = element_rect(fill='transparent'), #transparent legend bg
legend.box.background = element_rect(fill='transparent') #transparent legend panel
)
stacked_plots <- p2 + p3 + p1 + plot_layout(ncol = 1)
stacked_plots
}log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
hold = log_census[order(log_census$V2),]
hold = hold$V1equal_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_quilt.census.txt",header=TRUE,sep="\t",skip=2)
equal_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_geno.census.txt",header=TRUE,sep="\t",skip=2)
equal_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(equal_quilt,equal_geno,equal_census,hold)Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
The bottom bargraph represents the proportion of reads for each donor within the “pool” of samples. The total height of the bar represents the reads per sample inputted into Census-seq, but the red on top represents reads which did not pass the quality control. The line graph on top represents the difference between the true proprotion and the estimated proportions. Dots over the line are overestimations and dots under the line are underestimations.
log_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_quilt.census.txt",header=TRUE,sep="\t",skip=2)
log_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_geno.census.txt",header=TRUE,sep="\t",skip=2)
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(log_quilt,log_geno,log_census,hold)random_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_quilt.census.txt",header=TRUE,sep="\t",skip=2)
random_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_geno.census.txt",header=TRUE,sep="\t",skip=2)
random_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(random_quilt,random_geno,random_census,hold)# library(reticulate)
# conda_create(envname = "myenv")
# conda_install( envname = "myenv", packages = c("seaborn","pandas","matplotlib","plotly"))
# use_condaenv(condaenv = "myenv",required = TRUE )# import os
# import numpy as np
# import seaborn as sb
# import pandas as pd
# import matplotlib.pyplot as plt
# import plotly.express as px
#
# sample_amounts = []
# read_numbers = []
# percent_errors = []
# proportion_types = []
#
# def calculate_pe(df):
# df['PERCENT_ERROR'] = abs(df['REPRESENTATION'] - df['KNOWN']) / df['KNOWN'] * 100
# av_percent_error = df['PERCENT_ERROR'].mean()
# return av_percent_error
#
# for root,dirs,files, in os.walk('/flashscratch/munger-rea/bwa-mem/'):
# for file in files:
# if file == "my.census.txt":
# file_path = os.path.join(root,file)
# print(file_path)
#
# path_parts = root.split('/')
# sample_amount = path_parts[-4]
# proportion_type = path_parts[-3]
# read_number = path_parts[-2]
#
# df = pd.read_csv(os.path.join(root, file), sep="\t",skiprows=2)
# percent_error = calculate_pe(df)
# sample_amounts.append(sample_amount)
# read_numbers.append(read_number)
# percent_errors.append(percent_error)
# proportion_types.append(proportion_type)
#
# percent_errors = [float(i) for i in percent_errors]
#
# reads_to_numbers = {
# 'h_thousand': 100000,
# 'o_million': 1000000,
# 't_million': 10000000,
# 'f_million': 50000000,
# 'h_million': 100000000
# }
#
# read_numbers = [reads_to_numbers[i] for i in read_numbers]
#
# samples_to_numbers = {
# 's4': 4,
# 's8': 8,
# 's16': 16,
# 's32': 32,
# 's48': 48
# }
#
# sample_amounts = [samples_to_numbers[i] for i in sample_amounts]
#
# df = pd.DataFrame({
# 'Reads': read_numbers,
# 'Samples': sample_amounts,
# 'Percent_Errors': percent_errors,
# 'Proportion_Type': proportion_types
# })
#
# fig = px.scatter_3d(df, x='Samples', y='Reads', z='Percent_Errors',
# color='Proportion_Type')
# fig.show()sample_amounts <- c()
read_numbers <- c()
percent_errors <- c()
proportion_types <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
sample_amount <- path_parts[length(path_parts) - 4]
proportion_type <- path_parts[length(path_parts) - 3]
read_number <- path_parts[length(path_parts) - 2]
df <- read_tsv(file_path, skip = 2)
percent_error <- calculate_pe(df)
sample_amounts <- c(sample_amounts, sample_amount)
read_numbers <- c(read_numbers, read_number)
percent_errors <- c(percent_errors, percent_error)
proportion_types <- c(proportion_types, proportion_type)
}
percent_errors <- as.numeric(percent_errors)
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
read_numbers <- sapply(read_numbers, function(x) reads_to_numbers[[x]])
samples_to_numbers <- list(
's4' = 4,
's8' = 8,
's16' = 16,
's32' = 32,
's48' = 48
)
sample_amounts <- sapply(sample_amounts, function(x) samples_to_numbers[[x]])
df <- data.frame(
'Reads' = read_numbers,
'Samples' = sample_amounts,
'Percent_Error' = percent_errors,
'Proportion_Type' = proportion_types
)
fig <- plot_ly(df, x = ~Samples, y = ~Reads, z = ~Percent_Error, color = ~Proportion_Type, colors = c('#648FFF','#DC267F','#FFB000'))
fig <- fig %>% add_markers(marker = list(opacity = 0.5))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Number of Samples'),
yaxis = list(title = 'Number of Reads'),
zaxis = list(title = 'Percent Error')))
figlibrary(dplyr)
library(readr)
library(plotly)
read_numbers <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/random/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
read_number <- path_parts[length(path_parts) - 2]
run_number <- path_parts[length(path_parts) - 1]
df <- read_tsv(file_path, skip = 2)
df <- df %>%
mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
results <- rbind(results, df)
}
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results$DONOR <- as.factor(results$DONOR)
fig <- results %>%
plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
)
figlibrary(dplyr)
library(readr)
library(plotly)
library(gapminder)
read_numbers <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/log/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
read_number <- path_parts[length(path_parts) - 2]
run_number <- path_parts[length(path_parts) - 1]
df <- read_tsv(file_path, skip = 2)
df <- df %>%
mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
results <- rbind(results, df)
}
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results$DONOR <- as.factor(results$DONOR)
fig <- results %>%
plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
)
figbowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = (abs(bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = (abs(bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = (abs(bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = (abs(bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal() bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$mini = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$mini = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie = ((bwa$bowtie - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
pheatmap(df)#meltedviolin = melt(meltedviolin,id="DONOR")
#colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
# heatmap_plot = ggplot(data=meltedviolin, aes(x=Donor,y=Aligner)) +
# geom_tile(aes(fill=PercentError)) +
# scale_fill_gradient2() +
# theme(axis.text.y = element_text(size = 6)) +
# theme_minimal()bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION - STAR$KNOWN
bwa$mini = mini$REPRESENTATION - mini$KNOWN
bwa$bwa = bwa$REPRESENTATION - bwa$KNOWN
bwa$bowtie = bowtie$REPRESENTATION - bowtie$KNOWN
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
pheatmap(df)files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 1000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 1000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 10000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 10000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2files <- list.files(path = "/flashscratch/munger-rea/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
theme_minimal()
p1 + p2library(plotly)
dens <- with(diamonds, tapply(price, INDEX = cut, density))
data <- data.frame(
x = unlist(lapply(dens, "[[", "x")),
y = unlist(lapply(dens, "[[", "y")),
cut = rep(names(dens), each = length(dens[[1]]$x)))
fig <- plot_ly(data, x = ~x, y = ~y, z = ~cut, type = 'scatter3d', mode = 'lines', color = ~cut)
fig